{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, AutoConfig\n",
    "from PIL import Image\n",
    "from train import VLMConfig, VLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the processor saved in the local siglip-base-patch16-224 directory.\n",
    "siglip_dir = \"/home/user/wyf/siglip-base-patch16-224\"\n",
    "processor = AutoProcessor.from_pretrained(siglip_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register the custom classes imported from train.py with the Auto* factories,\n",
    "# under the 'vlm_model' model type.\n",
    "AutoConfig.register(\"vlm_model\", VLMConfig)\n",
    "AutoModelForCausalLM.register(VLMConfig, VLM)\n",
    "\n",
    "# The tokenizer is read from the local Qwen2.5-0.5B-Instruct directory.\n",
    "qwen_dir = '/home/user/Downloads/Qwen2.5-0.5B-Instruct'\n",
    "tokenizer = AutoTokenizer.from_pretrained(qwen_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the checkpoint stored under save/sft through the causal-LM Auto factory.\n",
    "sft_dir = '/home/user/wyf/train_multimodal_from_scratch/save/sft'\n",
    "model = AutoModelForCausalLM.from_pretrained(sft_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Conversation for the chat template; the user turn ends with one <image> placeholder.\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": 'You are a helpful assistant.'},\n",
    "    {\"role\": \"user\", \"content\": '描述图片内容\\n<image>'},\n",
    "]\n",
    "chat_prompt = tokenizer.apply_chat_template(\n",
    "    messages,\n",
    "    tokenize=False,\n",
    "    add_generation_prompt=True,\n",
    ")\n",
    "\n",
    "# Swap the placeholder for a run of 49 <|image_pad|> tokens.\n",
    "# NOTE(review): 49 presumably has to match the number of image embeddings the VLM\n",
    "# splices in per picture - confirm against train.py.\n",
    "q_text = chat_prompt.replace('<image>', '<|image_pad|>' * 49)\n",
    "print(q_text)\n",
    "\n",
    "# Tokenize the expanded prompt into a PyTorch tensor of ids.\n",
    "encoded_prompt = tokenizer(q_text, return_tensors='pt')\n",
    "input_ids = encoded_prompt['input_ids']\n",
    "print(input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Force 3-channel RGB: a PNG may be stored as RGBA, palette or grayscale, and the\n",
    "# image processor is handed this object directly in a later cell. Files that are\n",
    "# already RGB keep the same pixel values.\n",
    "image = Image.open('/home/user/wyf/th.png').convert('RGB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bare expression: the notebook renders the loaded image as this cell's output.\n",
    "image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Request PyTorch tensors explicitly instead of relying on the processor's default\n",
    "# return type: both the .shape access below and the model call in the generation\n",
    "# cell need a tensor.\n",
    "pixel_values = processor(text=None, images=image, return_tensors='pt').pixel_values\n",
    "print(pixel_values.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch.nn import functional as F\n",
    "\n",
    "# Decoding settings.\n",
    "max_new_tokens = 20       # upper bound on the number of generated tokens\n",
    "temperature = 0.0         # 0.0 -> greedy pick, > 0 -> sample from the softmax\n",
    "top_k = None              # when sampling, keep only the k largest logits (None = keep all)\n",
    "repetition_penalty = 1.0  # 1.0 leaves the logits untouched\n",
    "eos = tokenizer.eos_token_id\n",
    "\n",
    "model.eval()\n",
    "# Prompt length. input_ids is extended in place of the prompt tensor, and the decode\n",
    "# cell slices input_ids[:, s:], so re-run the prompt cell before re-running this one.\n",
    "s = input_ids.shape[1]\n",
    "\n",
    "# Inference only: no_grad avoids building an autograd graph on every decoding step.\n",
    "with torch.no_grad():\n",
    "    # A counted loop yields up to max_new_tokens tokens; the earlier bound\n",
    "    # `input_ids.shape[1] < s + max_new_tokens - 1` stopped one token short.\n",
    "    for _ in range(max_new_tokens):\n",
    "        # The full sequence is fed to the model on every step. The arguments stay\n",
    "        # positional as before; the None slot is presumably `labels` - confirm\n",
    "        # against VLM.forward in train.py.\n",
    "        inference_res = model(input_ids, None, pixel_values)\n",
    "        logits = inference_res.logits[:, -1, :]  # scores for the next position only\n",
    "\n",
    "        if repetition_penalty != 1.0:\n",
    "            # Make tokens already in the sequence less likely: shrink positive logits\n",
    "            # and push negative ones further down. Uses row 0 only (batch of one).\n",
    "            seen = torch.unique(input_ids[0])\n",
    "            seen_logits = logits[:, seen]\n",
    "            logits[:, seen] = torch.where(\n",
    "                seen_logits > 0,\n",
    "                seen_logits / repetition_penalty,\n",
    "                seen_logits * repetition_penalty,\n",
    "            )\n",
    "\n",
    "        if temperature == 0.0:\n",
    "            # Greedy: take the single highest-scoring token.\n",
    "            _, idx_next = torch.topk(logits, k=1, dim=-1)\n",
    "        else:\n",
    "            logits = logits / temperature\n",
    "            if top_k is not None:\n",
    "                # Mask everything below the k-th largest logit.\n",
    "                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n",
    "                logits[logits < v[:, [-1]]] = -float('Inf')\n",
    "            probs = F.softmax(logits, dim=-1)\n",
    "            idx_next = torch.multinomial(probs, num_samples=1)\n",
    "\n",
    "        # The prompt is a single sequence, so idx_next holds exactly one token id.\n",
    "        # The EOS token itself is not appended.\n",
    "        if idx_next.item() == eos:\n",
    "            break\n",
    "\n",
    "        input_ids = torch.cat((input_ids, idx_next), dim=1)\n",
    "\n",
    "print(input_ids[:, s:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode only the ids that follow the first s positions of the single sequence.\n",
    "generated_ids = input_ids[0, s:]\n",
    "print(tokenizer.decode(generated_ids))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "wyf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
